In [5]:
x = ["duck", "aardvark", "crocodile", "emu", "bee",]
In [6]:
sorted(x)
Out[6]:
In [7]:
# Sorts the original list
x.sort()
In [8]:
x
Out[8]:
In [9]:
# Similar to order by DESC in SQL
sorted(x, reverse=True)
Out[9]:
In [10]:
# Sorted by second letter of each string: aardvard, bee, emu, crocodile, duck
# We're going to do something like this:
# sorted(x, key=???)
In [11]:
def get_second_letter(s):
return s[1]
In [12]:
get_second_letter("cheese")
Out[12]:
In [13]:
# You are passing our get_second_letter function as a parameter to your sorted() function
sorted(x, key=get_second_letter)
Out[13]:
In [14]:
type(12)
Out[14]:
In [15]:
type("hello")
Out[15]:
In [16]:
# Function is just another kind of value in Python -- therefore, you can pass them around like variables
type(get_second_letter)
Out[16]:
A alternate way of writing functions; written on a single line.
In [17]:
# Normal function:
def get_second_letter(s):
return s[1]
# The above function is pretty simple -- passing only one parameter, and get a simple return statement
get_second_letter = lambda s: s[1] # This translates EXACTLY to the function above; achieves exactly the same thing
# Note that if we had expressions other than our return statement, we could NOT use a lambda function
In [18]:
get_second_letter("hello")
Out[18]:
In [19]:
# This expression itself is a function
# This function does not need to have a variable associated with it; it doesn't need a name!
# Allows you to define a function without assigning it to a variable
# Great when you only want to use a function once
type(lambda s: s[1])
Out[19]:
In [20]:
sorted(x, key=lambda s: s[1])
# same as: sorted(x, key=get_second_letter)
Out[20]:
In [21]:
planets = [
{'diameter': 0.382,
'mass': 0.06,
'moons': 0,
'name': 'Mercury',
'orbital_period': 0.24,
'rings': 'no',
'type': 'terrestrial'},
{'diameter': 0.949,
'mass': 0.82,
'moons': 0,
'name': 'Venus',
'orbital_period': 0.62,
'rings': 'no',
'type': 'terrestrial'},
{'diameter': 1.00,
'mass': 1.00,
'moons': 1,
'name': 'Earth',
'orbital_period': 1.00,
'rings': 'no',
'type': 'terrestrial'},
{'diameter': 0.532,
'mass': 0.11,
'moons': 2,
'name': 'Mars',
'orbital_period': 1.88,
'rings': 'no',
'type': 'terrestrial'},
{'diameter': 11.209,
'mass': 317.8,
'moons': 67,
'name': 'Jupiter',
'orbital_period': 11.86,
'rings': 'yes',
'type': 'gas giant'},
{'diameter': 9.449,
'mass': 95.2,
'moons': 62,
'name': 'Saturn',
'orbital_period': 29.46,
'rings': 'yes',
'type': 'gas giant'},
{'diameter': 4.007,
'mass': 14.6,
'moons': 27,
'name': 'Uranus',
'orbital_period': 84.01,
'rings': 'yes',
'type': 'ice giant'},
{'diameter': 3.883,
'mass': 17.2,
'moons': 14,
'name': 'Neptune',
'orbital_period': 164.8,
'rings': 'yes',
'type': 'ice giant'}]
In [22]:
[p['name'] for p in sorted(planets, key=lambda x: x['moons'])]
Out[22]:
In [23]:
def get_moon_count(d):
return d['moons']
[p['name'] for p in sorted(planets, key=get_moon_count)]
Out[23]:
What it would look like in SQL:
SELECT name FROM planets
ORDER BY moons
In [24]:
[p['name'] for p in sorted(planets, key=lambda d: d['diameter'], reverse=True)]
Out[24]:
Aside: putting
\
in a python line allows you to write the rest of what you were writing on the next line!
In [25]:
[p['name'] for p in \
sorted(planets, key=lambda d: d['diameter'], reverse=True) \
if p['diameter'] > 4]
Out[25]:
Tuple is kind of like a strict list. It looks kind of like a list.
In [26]:
t = (5, 10, 15)
In [27]:
type(t)
Out[27]:
In [28]:
t[0]
Out[28]:
In [29]:
for item in t:
print(item * item)
The main difference between a tuple and a list: you can't add new values to a tuple.
In [30]:
t.append(30)
In [31]:
carefree_list = [5, 10, 15, 20, 25]
In [32]:
carefree_list
Out[32]:
In [33]:
carefree_list.append(30)
In [34]:
carefree_list
Out[34]:
In [35]:
carefree_list[1] = "Mr. Fluffypants"
In [36]:
carefree_list
Out[36]:
A tuple is called an immutable data type. You can't change it after it has been defined!
Benefits of using tuples:
In [37]:
# Immutable data type
# You can't change it after it has been defined
t[1] = "Mr. Fluffypants"
In [38]:
hello = [1, 2, 3]
In [39]:
foo = (1, 2, 3)
In [40]:
import sys
sys.getsizeof(hello)
Out[40]:
In [41]:
# Our tuple take up less memory!
sys.getsizeof(foo)
Out[41]:
In most applications, we want lists because we're working with data. We want to be able to append, delete or otherwise modify our data. A list is like a notebook and a tuple is stone tablet.
The standard library will have functions that return data structures as tuples (because it has an interest in efficiency).
In [42]:
import re
test = "one 1 two 2 three 3 four 4 five 5"
# we want every occurence of a word followed by a number
# .findall() finds every occurence of the regular expression in our string
re.findall(r"\w+ \d", test)
Out[42]:
In [43]:
# We want to access one and 1 individually
for item in re.findall(r"\w+ \d", test):
print(number_s)
print(number_i)
In [44]:
# This particular example is trivial because we know we can do this:
for item in re.findall(r"\w+ \d", test):
x = item.split(" ")
print(x[0])
print(x[1])
In [45]:
test = "one 1 two 2 three 3 four 4 five 5"
re.findall(r"(\w+) (\d)", test)
# returns a list of tuples
Out[45]:
In [46]:
all_subjects = open("enronsubjects.txt").read()
In [47]:
re.findall(r"\d{3}-\d{3}-\d{4}", all_subjects)
Out[47]:
In [48]:
# What if we wanted to grab them as separate items?
re.findall(r"(\d{3})-(\d{3})-(\d{4})", all_subjects)
Out[48]:
In [49]:
for item in re.findall(r"(\d{3})-(\d{3})-(\d{4})", all_subjects):
print(item[0])
In [50]:
[item[0] for item in re.findall(r"(\d{3})-(\d{3})-(\d{4})", all_subjects)]
Out[50]:
In [51]:
# dollar sign, followed by some number of digits, followed by 0 or 1 space(s), followed by some number of alphanumberic chracters
r"\$(\d+) ?(\w+)"
Out[51]:
In [52]:
re.findall(r"\$(\d+) ?(\w+)", all_subjects)
Out[52]:
In [53]:
# Let's refine our search
re.findall(r"\$(\d+) ?([bBmM])", all_subjects)
Out[53]:
In [54]:
# Add up all the values mentioned in the subject lines
vals = []
for item in re.findall(r"\$(\d+) ?([bBmM])", all_subjects):
multiplier = item[1].lower()
number_val = int(item[0])
if multiplier == 'k':
number_val *= 1000
elif multiplier == 'm':
number_val *= 1000000
elif multiplier == 'b':
number_val *= 1000000000
vals.append(number_val)
sum(vals)
Out[54]:
The point: re.findall() returns a list of tuples because the number of groups that will be returned will not be changed.
In [55]:
message = "This is a test, this is only a test"
In [56]:
# You can chain .replace()
message.replace("this", "that").replace("test", "walrus")
Out[56]:
In [57]:
re.findall(r"\d{3}-\d{3}-\d{4}", all_subjects)
Out[57]:
In [58]:
message = "This is a test, this is only a test"
In [59]:
# re.sub(???, ???, ???): takes 3 parameters
# word to find, the word to replace found word with, a string
re.sub(r"[Tt]his", "that", message)
Out[59]:
In [60]:
message
Out[60]:
In [61]:
re.sub(r"\b\w+\b", "walrus", message)
Out[61]:
In [62]:
# You could also pass a function as the second parameter
# re.sub(r"\b\w+\b", function, message)
In [63]:
anon = re.sub(r"\d{3}-\d{3}-\d{4}", "555-555-5555", all_subjects)
In [64]:
re.findall(r"\d{3}-\d{3}-\d{4}", anon)
Out[64]:
In [65]:
# Find twenty characters before and after our phone numbers
re.findall(r".{,20}\d{3}-\d{3}-\d{4}.{,20}", anon)
Out[65]:
In [66]:
anon2 = re.sub(r"(\d{3})-(\d{3})-(\d{4})", r"\1-\2-XXXX", anon)
In [67]:
re.findall(r".{,20}\d{3}-\d{3}-X{4}.{,20}", anon2)
Out[67]:
We've done a lot of work getting data out of databases, but SQL databases are also a good place to store your data.
In [68]:
from urllib.request import urlretrieve
urlretrieve("https://raw.githubusercontent.com/ledeprogram/data-and-databases/master/menupages-morningside-heights.html",
"menupages-morningside-heights.html")
Out[68]:
<tr>
that is a child of the <table>
tag with class search-results
<td>
tags with class name-address
<a>
tag inside that <td>
<span>
inside a <td>
with class price
<td>
tag that has no class. 5th <td>
tag that is a child of the restaurant's <tr>
taga list of dictionary (this will be our intermediary format before we make an actual SQL table)
[
{ "name": "Brad's", "price": "1", "cuisine": '['coffee'] },
{ "name": "Cafe Nana", "price": "0", "cuisine": '['Middle Eastern', 'Kosher'] },
...
]
In [69]:
from bs4 import BeautifulSoup
In [70]:
raw_html = open("menupages-morningside-heights.html").read() # read in as a string
soup = BeautifulSoup(raw_html, "html.parser") # parse it as html
In [71]:
search_table = soup.find("table", {"class": "search-results"})
table_body = search_table.find("tbody")
for tr_tag in table_body.find_all('tr'):
print(tr_tag) # Print to make sure your code is working as you hope!
print("______________________\n") # I like using separators to make prints easier to process!
In [72]:
# Get the restaurant names
search_table = soup.find("table", {"class": "search-results"})
table_body = search_table.find("tbody")
for tr_tag in table_body.find_all('tr'):
name_address_tag = tr_tag.find("td", {"class": "name-address"})
a_tag = name_address_tag.find("a")
print(a_tag.string)
In [73]:
# Getting names AND prices
search_table = soup.find("table", {"class": "search-results"})
table_body = search_table.find("tbody")
for tr_tag in table_body.find_all('tr'):
# Get the restaurant name from the <a> inside a <td>
name_address_tag = tr_tag.find("td", {"class": "name-address"})
a_tag = name_address_tag.find("a")
restaurant_name = a_tag.string
# Get the price from <span> if present
price_tag = tr_tag.find("td", {"class": "price"})
price_span_tag = price_tag.find("span")
if price_span_tag:
price = int(price_span_tag.string)
else:
price = 0 #some of our restaurants don't have a span tag!
print(restaurant_name, price)
Step 1: Writing a framework for our functions and some test code
In [74]:
# Write the framework of a function first and then
def get_name(tr_tag):
return "TEST RESTAURANT"
def get_price(tag):
return "999999"
# This code will allow us to TEST our functions! Our function frameworks return values of the right type.
search_table = soup.find("table", {"class": "search-results"})
table_body = search_table.find("tbody")
for tr_tag in table_body.find_all('tr'):
restaurant_name = get_name(tr_tag)
price = get_price(tr_tag)
print(restaurant_name, price)
# Yay, it's bring what we told it to.
Step 2: Writing our get_names() function
Copying the code from our for loop
In [75]:
def get_name(tr_tag):
name_address_tag = tr_tag.find("td", {"class": "name-address"})
a_tag = name_address_tag.find("a")
restaurant_name = a_tag.string
return restaurant_name
def get_price(tag):
return "999999"
# This code will allow us to TEST our functions! Our function frameworks return values of the right type.
search_table = soup.find("table", {"class": "search-results"})
table_body = search_table.find("tbody")
for tr_tag in table_body.find_all('tr'):
restaurant_name = get_name(tr_tag)
price = get_price(tr_tag)
print(restaurant_name, price)
Step 3: Writing our get_price() function
Again, just copying the code from our for loop
In [76]:
# Our functions:
def get_name(tr_tag):
name_address_tag = tr_tag.find("td", {"class": "name-address"})
a_tag = name_address_tag.find("a")
restaurant_name = a_tag.string
return restaurant_name
def get_price(tr_tag):
price_tag = tr_tag.find("td", {"class": "price"})
price_span_tag = price_tag.find("span")
if price_span_tag:
price = int(price_span_tag.string)
else:
price = 0
return price
# Looping through all the restaurants:
search_table = soup.find("table", {"class": "search-results"})
table_body = search_table.find("tbody")
for tr_tag in table_body.find_all('tr'):
restaurant_name = get_name(tr_tag)
price = get_price(tr_tag)
print(restaurant_name, price)
Step 4: Setting up our get_cuisines() function
We're writing a dummy function just to test our code basically works.
In [77]:
def get_cuisines(tr_tag):
# we want to return a list of strings
# SOMEDAY HAVE GOOD CODE HERE
# dummy return to test it
return ['stuff', 'blah', 'etc']
for tr_tag in table_body.find_all('tr'):
restaurant_name = get_name(tr_tag)
price = get_price(tr_tag)
cuisines = get_cuisines(tr_tag)
print(restaurant_name, price, cuisines)
Step 5: Writing our get_cuisines() function
In [78]:
def get_cuisines(tr_tag):
all_td_tags = tr_tag.find_all("td")
cuisine_tag = all_td_tags[4]
cuisines = cuisine_tag.string
cuisines_list = cuisines.split(", ")
return cuisines_list
for tr_tag in table_body.find_all('tr'):
restaurant_name = get_name(tr_tag)
price = get_price(tr_tag)
cuisines = get_cuisines(tr_tag)
print(restaurant_name, price, cuisines)
# We run into an error!
Step 6: Debugging our error with strategic print statements
In [79]:
def get_cuisines(tr_tag):
all_td_tags = tr_tag.find_all("td")
cuisine_tag = all_td_tags[4]
print(cuisine_tag) # debugging strategy to see what is causing our code to break
cuisines = cuisine_tag.string
cuisines_list = cuisines.split(", ")
return cuisines_list
for tr_tag in table_body.find_all('tr'):
restaurant_name = get_name(tr_tag)
price = get_price(tr_tag)
print(restaurant_name) # debugging strategy to see what restaurant is causing our code to break
cuisines = get_cuisines(tr_tag)
print(restaurant_name, price, cuisines)
# sometimes of <td> tag is empty!
Step 7: Correcting get_cuisines() function with if/else condition for NoneTypes
In [80]:
def get_cuisines(tr_tag):
all_td_tags = tr_tag.find_all("td")
cuisine_tag = all_td_tags[4]
cuisines = cuisine_tag.string
if cuisines:
cuisines_list = cuisines.split(", ")
else:
cuisines_list = []
return cuisines_list
for tr_tag in table_body.find_all('tr'):
restaurant_name = get_name(tr_tag)
price = get_price(tr_tag)
cuisines = get_cuisines(tr_tag)
print(restaurant_name, price, cuisines)
# Woohoo, everything prints like we'd want it to!
Step 8: Saving our data to a list of dictionaries
We want to do more than just print! The data structure of a list of dictionaries is pretty flexible -- we can go to a pandas DataFrame, a csv, and someday, a SQL table from there.
In [152]:
restaurants = []
for tr_tag in table_body.find_all('tr'):
restaurant_name = get_name(tr_tag)
price = get_price(tr_tag)
cuisines = get_cuisines(tr_tag)
rest_dict = {'name': restaurant_name, 'price': int(price), 'cuisines': cuisines}
restaurants.append(rest_dict)
restaurants
Out[152]:
Step 9: pandas DataFrame!
In [85]:
import pandas as pd
df = pd.DataFrame(restaurants)
In [86]:
df
Out[86]:
"schema" ---> designing the tables
why we need more than one table? The way how relational data works. You have restaurants that has more than one cousine. We want to be able to able to do data normalization, which is a term use in relational data base. Its about separate data so any piece of data only appears one time.
"entities" ---> the thing you are storing
Restaurant table: id
{unique integer identifying the restaurant}
name
string wirh restaurant name
price
integer that correspoonds to the number of dollar sings
kind
string that identifies the cuisines type itself
cuisines table:
restaurant_id
number associated with the restaurant)
kind
Sample entry from restaurant table id: 4 name: brad's
sample entry from cuisines table restaurant_id: 4
restaurant_id:4 kind: seafood
"setup phae" creating database and creating tables "one time" --> psql "working with data phase" --> inserting records, selecting stuff ---> python
sql data types like
In [139]:
import pg8000
conn = pg8000.connect(database="menupages")
In [140]:
type(conn)
Out[140]:
In [163]:
conn.rollback() #execute this whenever toy make a SQL problem
In [142]:
cursor = conn.cursor()
cursor objects:
In [119]:
cursor.execute("INSERT INTO restaurant (name, price) VALUES ('good Food Places', 3)")
conn.commit()
In [120]:
cursor.execute("select * from restaurant")
for item in cursor.fetchall():
print(item)
In [121]:
cursor.execute("INSERT INTO restaurant (name, price) VALUES ('Palace of Vegan Nosh', 3) returning id")
results = cursor.fetchone()
conn.commit()
In [122]:
results
Out[122]:
In [123]:
rowld = results[0]
In [124]:
rowld
Out[124]:
In [125]:
#WILL NOT WORK
cursor.execute("INSERT INTO restaurant (name, price) VALUES ('Brad's', 3) returning id")
results = cursor.fetchone()[0]
conn.commit()
In [ ]:
#SQL injection attack
restaurant = "'Restaurant'); Delete from restaurant;"
string in python --> "quote" "escape" ---> valid sql statement
#very weird difficult and arquade
In [147]:
rest_insert = "INSERT INTO restaurant (name, price) VALUES (%s, %s)"
cursor.execute(rest_insert, ["Brad's", 1])
# pg8000 does the work: "INSERT INTOrestaurant (name, price) values ('Brad\'\'s', 1)"
conn.commit()
In [144]:
cursor.execute("insert into restaurant (name, price) values (%s, %s) returning id", ["Test Restaurant", 2])
rowid = cursor.fetchone()[0]
conn.commit()
In [146]:
#let's say Test Restaunrant servers fondue and casseroles
cuisine_insert = "insert into cuisine (restaurant_id, kind) values (%s, %s)"
cursor.execute(cuisine_insert, [rowid, "fondue"])
cursor.execute(cuisine_insert, [rowid, 'casseroles'])
conn.commit
Out[146]:
In [150]:
rest_insert = "insert into restaurant (name, price) values (%s, %s)"
for item in restaurants:
#execute sql statement with data from the restaurant!
cursor.execute(rest_insert, [item['name'], item['price']])
conn.commit
In [153]:
first = restaurants[0]
first
Out[153]:
In [154]:
print(first ['name'])
In [155]:
type(first['name'])
Out[155]:
In [157]:
rest_insert = "insert into restaurant (name, price) values (%s, %s)"
for item in restaurants:
#execute sql statement with data from the restaurant!
cursor.execute(rest_insert, [str(item['name']), item['price']])
conn.commit()
In [158]:
cursor.execute("insert into restaurant (name, price) values (%s, %s) returning id", ["Test Restaurant", 2])
rowid = cursor.fetchone()[0]
conn.commit()
In [164]:
conn.rollback()
In [159]:
## /Inserting voth restaunrants and their cuisiness
In [165]:
restaurants
Out[165]:
In [166]:
rest_insert = "insert into restaurant (name, price) values (%s, %s) returning id"
cuisine = "insert into cuisine (restaurant_id, kind) values (%s, %s)"
for item in restaurants:
#execute sql statement with data from the restaurant!
cursor.execute(rest_insert, [str(item['name']), item['price']])
rowid = cursor.fetchone()[0]
for cuisine in item['cuisines']:
print(" - inserting cuisine", cuisine)
cursor.execute(cuisine_insert, [rowid, str(cuisine)])
# insert restaunrant_id, cuisine kind into cuisine table
conn.commit()
In [ ]:
poem = """Whose woods these are I think I know
His house is in the village though"""